BeautifulSoup

Parsing HTML and XML

  • Third-party library

    https://www.crummy.com/software/BeautifulSoup/

    pip install beautifulsoup4

  • Simple example

    html = '''<html>
    <head><title>My first HTML page</title></head>
    <body>
    <p>The content of the body element is displayed in the browser window.</p>
    <p class="note">The content of the title element appears in the browser's title bar.</p>
    </body></html>'''

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    print(soup.prettify())

Beautiful Soup parsers

| Parser | Usage | Requirement | Strengths | Weaknesses |
| ------------------ | -------------------------------- | -------------------- | ------------------------------------------------------------------------------------------------------ | ------------------------------------------------------ |
| bs4's HTML parser | BeautifulSoup(mk, 'html.parser') | install bs4 | Part of the Python standard library, moderate speed, tolerant of malformed documents | Poor tolerance in versions before Python 2.7.3 / 3.2.2 |
| lxml's HTML parser | BeautifulSoup(mk, 'lxml') | pip install lxml | Fast, tolerant of malformed documents | Needs the C library |
| lxml's XML parser | BeautifulSoup(mk, 'xml') | pip install lxml | Fast, the only XML-capable parser listed here | Needs the C library |
| html5lib's parser | BeautifulSoup(mk, 'html5lib') | pip install html5lib | Most tolerant, parses a document the way a browser does, produces HTML5-style documents, pure Python | Slow |
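
The practical difference between these parsers shows up on malformed markup. A minimal sketch of pushing the same broken fragment through each one (lxml and html5lib must be pip-installed first):

# deliberately broken HTML handled by each parser
from bs4 import BeautifulSoup

broken = '<a><p>unclosed paragraph'
for parser in ('html.parser', 'lxml', 'html5lib'):
    print(parser, '->', BeautifulSoup(broken, parser))

The three results differ: for instance, html5lib adds the full <html><head><body> scaffolding a browser would.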

Basic elements of the BeautifulSoup class

| Element | Description |
| --------------- | ------------------------------------------------------------------- |
| Tag | A tag, the basic unit of information, delimited by <> and </> |
| Name | The tag's name; the name of <p>…</p> is p; accessed as <tag>.name |
| Attributes | The tag's attributes, organized as a dict; accessed as <tag>.attrs |
| NavigableString | The non-attribute string inside <>…</>; accessed as <tag>.string |
| Comment | A comment string inside a tag, a special Comment type |
# simple example
import requests
from bs4 import BeautifulSoup

url = 'https://www.crummy.com/software/BeautifulSoup/'
try:
    r = requests.get(url)
    r.raise_for_status()
except requests.RequestException:
    print('network resource error')
    raise

soup = BeautifulSoup(r.text, 'html.parser')
print(soup.title)
'''
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
'''
print(soup.a, type(soup.a))  # when several tags share a name, the first one is returned
'''
<a href="bs4/download/"><h1>Beautiful Soup</h1></a> <class 'bs4.element.Tag'>
'''
print(soup.a.name)
'''
a
'''
print(soup.a.attrs)
'''
{'href': 'bs4/download/'}
'''
print(soup.a.attrs['href'])
'''
bs4/download/
'''
print(soup.a.string)
'''
Beautiful Soup
'''

newsoup = BeautifulSoup('<b><!--This is a comment--></b><p>This is not a comment</p>', 'html.parser')
print(newsoup.b.string, type(newsoup.b.string))
'''
This is a comment <class 'bs4.element.Comment'>
'''

print(newsoup.p.string, type(newsoup.p.string))
'''
This is not a comment <class 'bs4.element.NavigableString'>
'''

Traversing the DOM tree

  • Downward traversal: from the root to the leaves

    | Attribute    | Description                                                       |
    | ------------ | ----------------------------------------------------------------- |
    | .contents    | A list of the tag's direct children                                |
    | .children    | An iterator over direct children, like .contents but for looping   |
    | .descendants | An iterator over all descendant nodes, for looping                 |

    # simple example
    import requests
    from bs4 import BeautifulSoup

    url = 'https://www.crummy.com/software/BeautifulSoup/'
    try:
        r = requests.get(url)
        r.raise_for_status()
    except requests.RequestException:
        print('network resource error')
        raise

    soup = BeautifulSoup(r.text, 'html.parser')

    head = soup.head
    print(head.contents)
    '''
    ['\n', <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>, '\n', <title>Beautiful Soup: We called him Tortoise because he taught us.</title>, '\n', <link href="mailto:leonardr@segfault.org" rev="made"/>, '\n', <link href="/nb/themes/Default/nb.css" rel="stylesheet" type="text/css"/>, '\n', <meta content="Beautiful Soup: a library designed for screen-scraping HTML and XML." name="Description"/>, '\n', <meta content="Markov Approximation 1.4 (module: leonardr)" name="generator"/>, '\n', <meta content="Leonard Richardson" name="author"/>, '\n']
    '''
    print(head.contents[3])
    '''
    <title>Beautiful Soup: We called him Tortoise because he taught us.</title>
    '''
    # downward traversal over a tag's direct children
    for child in soup.head.children:
        print(child.name)
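
    To make the .children/.descendants distinction concrete, a small sketch on made-up markup:

    # made-up fragment: .children stops at direct children, .descendants goes all the way down
    from bs4 import BeautifulSoup

    s = BeautifulSoup('<div><p>hi <b>there</b></p></div>', 'html.parser')
    print([c.name for c in s.div.children])     # ['p']
    print([d.name for d in s.div.descendants])  # ['p', None, 'b', None] -- strings have name None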
  • Upward traversal: from a leaf to the root

    | Attribute | Description                                             |
    | --------- | ------------------------------------------------------- |
    | .parent   | The node's parent tag                                    |
    | .parents  | An iterator over the node's ancestor tags, for looping   |

    # attribute examples
    import requests
    from bs4 import BeautifulSoup

    url = 'https://www.crummy.com/software/BeautifulSoup/'
    try:
        r = requests.get(url)
        r.raise_for_status()
    except requests.RequestException:
        print('network resource error')
        raise

    soup = BeautifulSoup(r.text, 'html.parser')
    print(soup.a.parent)
    '''
    <div align="center">
    <a href="bs4/download/"><h1>...</h1></a><p>...</p>
    </div>
    '''
    print(soup.a.parents)
    '''
    <generator object PageElement.parents at 0x00000210B2B4C480>
    '''
    print([f.name for f in soup.a.parents])
    '''
    ['div', 'body', 'html', '[document]']
    '''

    print(soup.html.parent.name, type(soup.html.parent.name))
    '''
    [document] <class 'str'>
    '''
    # upward traversal over a tag's ancestors
    for p in soup.a.parents:
        print(p.name)
  • Sibling traversal: moving between nodes that share the same parent

    | Attribute          | Description                                                 |
    | ------------------ | ----------------------------------------------------------- |
    | .next_sibling      | The next sibling tag in document order                      |
    | .previous_sibling  | The previous sibling tag in document order                  |
    | .next_siblings     | An iterator over all following siblings in document order   |
    | .previous_siblings | An iterator over all preceding siblings in document order   |

    # attribute examples
    import requests
    from bs4 import BeautifulSoup

    url = 'https://www.crummy.com/software/BeautifulSoup/'
    try:
        r = requests.get(url)
        r.raise_for_status()
    except requests.RequestException:
        print('network resource error')
        raise

    soup = BeautifulSoup(r.text, 'html.parser')
    print(soup.a.next_sibling.next_sibling)  # the first .next_sibling is a whitespace string, so step twice
    '''
    <p>"A tremendous boon." -- Python411 Podcast</p>
    '''
    # sibling traversal over a tag's following siblings
    for sibling in soup.a.next_siblings:
        print(sibling.name)
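
    The doubled .next_sibling in the earlier example is needed because the whitespace between tags is itself a sibling (a NavigableString). A made-up sketch:

    # whitespace between tags counts as a sibling node
    from bs4 import BeautifulSoup

    s = BeautifulSoup('<p>one</p>\n<p>two</p>', 'html.parser')
    print(repr(s.p.next_sibling))         # '\n' -- a bare newline string
    print(s.p.next_sibling.next_sibling)  # <p>two</p>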

Finding all specified tags

soup.find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)

Calling a tag or the soup object directly is shorthand for find_all: soup('a') is equivalent to soup.find_all('a').

  • name: a single tag name (regular expressions are supported), a list of tag names, or True to match every tag.
  • attrs: a string matched against attribute values; the search can be restricted to a particular attribute.
  • **kwargs: keyword filters such as id, class_ (class is a Python reserved word) and string; their values may be regular expression patterns. See the sketch below.
  • recursive: whether to search all descendants (True, the default) or only direct children (False).
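
A sketch of these parameters on made-up markup (the tag names and attributes below are illustrative only):

# find_all patterns on a made-up fragment
import re
from bs4 import BeautifulSoup

s = BeautifulSoup('<div id="menu"><a class="nav" href="/a">A</a>'
                  '<a class="nav" href="/b">B</a><b>bold</b></div>', 'html.parser')

print(s.find_all('a'))                   # a single tag name
print(s.find_all(['a', 'b']))            # a list of tag names
print(s.find_all(re.compile('^b')))      # tag names matching a regex: <b> here
print(s.find_all('a', class_='nav'))     # keyword filter; class_ because class is reserved
print(s.find_all(id='menu'))             # match on the id attribute
print(s.find_all('a', recursive=False))  # [] -- the soup's only direct child is <div>
print(s('a') == s.find_all('a'))         # True: calling a tag/soup is shorthand for find_all
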
# collect every URL linked from the page
import requests
from bs4 import BeautifulSoup

url = 'https://www.crummy.com/software/BeautifulSoup/'
try:
    r = requests.get(url)
    r.raise_for_status()
except requests.RequestException:
    print('network resource error')
    raise

soup = BeautifulSoup(r.text, 'html.parser')
for link in soup.find_all('a'):
    print(link.get('href'))

'''
bs4/download/
#Download
bs4/doc/
#HallOfFame
https://code.launchpad.net/beautifulsoup
https://bazaar.launchpad.net/%7Eleonardr/beautifulsoup/bs4/view/head:/CHANGELOG
https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup
zine/
https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=website
zine/
https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup
https://bugs.launchpad.net/beautifulsoup/
http://lxml.de/
http://code.google.com/p/html5lib/
bs4/doc/
None
bs4/download/
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
download/3.x/BeautifulSoup-3.2.1.tar.gz
None
http://www.nytimes.com/2007/10/25/arts/design/25vide.html
https://github.com/reddit/reddit/blob/85f9cff3e2ab9bb8f19b96acd8da4ebacc079f04/r2/r2/lib/media.py
http://www.harrowell.org.uk/viktormap.html
http://svn.python.org/view/tracker/importer/
http://www2.ljworld.com/
http://www.b-list.org/weblog/2010/nov/02/news-done-broke/
http://esrl.noaa.gov/gsd/fab/
http://laps.noaa.gov/topograbber/
http://groups.google.com/group/beautifulsoup/
https://launchpad.net/beautifulsoup
https://code.launchpad.net/beautifulsoup/
https://bugs.launchpad.net/beautifulsoup/
/source/software/BeautifulSoup/index.bhtml
/self/
/self/contact.html
http://creativecommons.org/licenses/by-sa/2.0/
http://creativecommons.org/licenses/by-sa/2.0/
http://www.crummy.com/
http://www.crummy.com/software/
http://www.crummy.com/software/BeautifulSoup/
'''

Worked example

# a chapter of a web novel on the biquge site
url = 'https://www.biquge.info/10_10582/10237602.html'

import requests
from bs4 import BeautifulSoup

try:
    r = requests.get(url)
    r.raise_for_status()
    r.encoding = r.apparent_encoding  # use the detected encoding to avoid mojibake
except requests.RequestException:
    print('network resource error')
    raise

soup = BeautifulSoup(r.text, 'html.parser')

title = soup.h1.string
content = ''.join(soup.find('div', id='content').strings)

print(title)
print()
print(content)